library(here)
library(dplyr)
library(arrow)
# Open the processed census Parquet directory (path built with here()) as an
# Arrow dataset; rows are only pulled into memory by a later collect().
census_dataset <- here("data", "processed", "parquet_data_coords") |>
  open_dataset()
# Auto-print the dataset to inspect its schema.
census_dataset
FileSystemDataset with 33 Parquet files
ENTIDAD: string
MUN: string
NOM_MUN: string
LOC: string
NOM_LOC: string
LONGITUD: string
LATITUD: string
ALTITUD: string
POBTOT: double
POBFEM: string
POBMAS: string
REL_H_M: string
POB0_14: string
POB15_64: string
POB65_MAS: string
P_0A4: string
P_0A4_F: string
P_0A4_M: string
P_5A9: string
P_5A9_F: string
P_5A9_M: string
P_10A14: string
P_10A14_F: string
P_10A14_M: string
P_15A19: string
P_15A19_F: string
P_15A19_M: string
P_20A24: string
P_20A24_F: string
P_20A24_M: string
P_25A29: string
P_25A29_F: string
P_25A29_M: string
P_30A34: string
P_30A34_F: string
P_30A34_M: string
P_35A39: string
P_35A39_F: string
P_35A39_M: string
P_40A44: string
P_40A44_F: string
P_40A44_M: string
P_45A49: string
P_45A49_F: string
P_45A49_M: string
P_50A54: string
P_50A54_F: string
P_50A54_M: string
P_55A59: string
P_55A59_F: string
P_55A59_M: string
P_60A64: string
P_60A64_F: string
P_60A64_M: string
P_65A69: string
P_65A69_F: string
P_65A69_M: string
P_70A74: string
P_70A74_F: string
P_70A74_M: string
P_75A79: string
P_75A79_F: string
P_75A79_M: string
P_80A84: string
P_80A84_F: string
P_80A84_M: string
P_85YMAS: string
P_85YMAS_F: string
P_85YMAS_M: string
PROM_HNV: string
PNACENT: string
PNACENT_F: string
PNACENT_M: string
PNACOE: string
PNACOE_F: string
PNACOE_M: string
PRES2015: string
PRES2015_F: string
PRES2015_M: string
PRESOE15: string
PRESOE15_F: string
PRESOE15_M: string
P3YM_HLI: string
P3YM_HLI_F: string
P3YM_HLI_M: string
P3HLINHE: string
P3HLINHE_F: string
P3HLINHE_M: string
P3HLI_HE: string
P3HLI_HE_F: string
P3HLI_HE_M: string
PHOG_IND: string
POB_AFRO: string
POB_AFRO_F: string
POB_AFRO_M: string
PCON_DISC: string
PCON_LIMI: string
PSIND_LIM: string
P3A5_NOA: string
P3A5_NOA_F: string
P3A5_NOA_M: string
P6A11_NOA: string
P6A11_NOAF: string
P6A11_NOAM: string
P12A14NOA: string
P12A14NOAF: string
P12A14NOAM: string
P15A17A: string
P15A17A_F: string
P15A17A_M: string
P18A24A: string
P18A24A_F: string
P18A24A_M: string
P8A14AN: string
P8A14AN_F: string
P8A14AN_M: string
P15YM_AN: string
P15YM_AN_F: string
P15YM_AN_M: string
P15YM_SE: string
P15YM_SE_F: string
P15YM_SE_M: string
P15PRI_IN: string
P15PRI_INF: string
P15PRI_INM: string
P15PRI_CO: string
P15PRI_COF: string
P15PRI_COM: string
P15SEC_IN: string
P15SEC_INF: string
P15SEC_INM: string
P15SEC_CO: string
P15SEC_COF: string
P15SEC_COM: string
P18YM_PB: string
P18YM_PB_F: string
P18YM_PB_M: string
GRAPROES: string
GRAPROES_F: string
GRAPROES_M: string
PEA: string
PEA_F: string
PEA_M: string
PE_INAC: string
PE_INAC_F: string
PE_INAC_M: string
POCUPADA: string
POCUPADA_F: string
POCUPADA_M: string
PDESOCUP: string
PDESOCUP_F: string
PDESOCUP_M: string
PSINDER: string
PDER_SS: string
P12YM_SOLT: string
P12YM_CASA: string
P12YM_SEPA: string
PCATOLICA: string
PRO_CRIEVA: string
POTRAS_REL: string
PSIN_RELIG: string
TOTHOG: string
HOGJEF_F: string
HOGJEF_M: string
POBHOG: string
PHOGJEF_F: string
PHOGJEF_M: string
longitude_decimal: double
latitude_decimal: double
NOM_ENT: string
See $metadata for additional Schema metadata
# Restrict the census dataset to rows whose NOM_ENT is "Puebla" and
# materialise the result in memory with collect().
pueb_norm <- collect(filter(census_dataset, NOM_ENT == "Puebla"))
Warning: Invalid metadata$rWarning: Invalid metadata$rWarning: Invalid metadata$rWarning: Invalid metadata$r
# Display the Puebla rows, then list the distinct municipality names.
pueb_norm
unique(pueb_norm[["NOM_MUN"]])
[1] "Total de la entidad Puebla" "Acajete" "Acateno" "Acatlán"
[5] "Acatzingo" "Acteopan" "Ahuacatlán" "Ahuatlán"
[9] "Ahuazotepec" "Ahuehuetitla" "Ajalpan" "Albino Zertuche"
[13] "Aljojuca" "Altepexi" "Amixtlán" "Amozoc"
[17] "Aquixtla" "Atempan" "Atexcal" "Atlixco"
[21] "Atoyatempan" "Atzala" "Atzitzihuacán" "Atzitzintla"
[25] "Axutla" "Ayotoxco de Guerrero" "Calpan" "Caltepec"
[29] "Camocuautla" "Caxhuacan" "Coatepec" "Coatzingo"
[33] "Cohetzala" "Cohuecan" "Coronango" "Coxcatlán"
[37] "Coyomeapan" "Coyotepec" "Cuapiaxtla de Madero" "Cuautempan"
[41] "Cuautinchán" "Cuautlancingo" "Cuayuca de Andrade" "Cuetzalan del Progreso"
[45] "Cuyoaco" "Chalchicomula de Sesma" "Chapulco" "Chiautla"
[49] "Chiautzingo" "Chiconcuautla" "Chichiquila" "Chietla"
[53] "Chigmecatitlán" "Chignahuapan" "Chignautla" "Chila"
[57] "Chila de la Sal" "Honey" "Chilchotla" "Chinantla"
[61] "Domingo Arenas" "Eloxochitlán" "Epatlán" "Esperanza"
[65] "Francisco Z. Mena" "General Felipe Ángeles" "Guadalupe" "Guadalupe Victoria"
[69] "Hermenegildo Galeana" "Huaquechula" "Huatlatlauca" "Huauchinango"
[73] "Huehuetla" "Huehuetlán el Chico" "Huejotzingo" "Hueyapan"
[77] "Hueytamalco" "Hueytlalpan" "Huitzilan de Serdán" "Huitziltepec"
[81] "Atlequizayan" "Ixcamilpa de Guerrero" "Ixcaquixtla" "Ixtacamaxtitlán"
[85] "Ixtepec" "Izúcar de Matamoros" "Jalpan" "Jolalpan"
[89] "Jonotla" "Jopala" "Juan C. Bonilla" "Juan Galindo"
[93] "Juan N. Méndez" "Lafragua" "Libres" "La Magdalena Tlatlauquitepec"
[97] "Mazapiltepec de Juárez" "Mixtla" "Molcaxac" "Cañada Morelos"
[101] "Naupan" "Nauzontla" "Nealtican" "Nicolás Bravo"
[105] "Nopalucan" "Ocotepec" "Ocoyucan" "Olintla"
[109] "Oriental" "Pahuatlán" "Palmar de Bravo" "Pantepec"
[113] "Petlalcingo" "Piaxtla" "Puebla" "Quecholac"
[117] "Quimixtlán" "Rafael Lara Grajales" "Los Reyes de Juárez" "San Andrés Cholula"
[121] "San Antonio Cañada" "San Diego la Mesa Tochimiltzingo" "San Felipe Teotlalcingo" "San Felipe Tepatlán"
[125] "San Gabriel Chilac" "San Gregorio Atzompa" "San Jerónimo Tecuanipan" "San Jerónimo Xayacatlán"
[129] "San José Chiapa" "San José Miahuatlán" "San Juan Atenco" "San Juan Atzompa"
[133] "San Martín Texmelucan" "San Martín Totoltepec" "San Matías Tlalancaleca" "San Miguel Ixitlán"
[137] "San Miguel Xoxtla" "San Nicolás Buenos Aires" "San Nicolás de los Ranchos" "San Pablo Anicano"
[141] "San Pedro Cholula" "San Pedro Yeloixtlahuaca" "San Salvador el Seco" "San Salvador el Verde"
[145] "San Salvador Huixcolotla" "San Sebastián Tlacotepec" "Santa Catarina Tlaltempan" "Santa Inés Ahuatempan"
[149] "Santa Isabel Cholula" "Santiago Miahuatlán" "Huehuetlán el Grande" "Santo Tomás Hueyotlipan"
[153] "Soltepec" "Tecali de Herrera" "Tecamachalco" "Tecomatlán"
[157] "Tehuacán" "Tehuitzingo" "Tenampulco" "Teopantlán"
[161] "Teotlalco" "Tepanco de López" "Tepango de Rodríguez" "Tepatlaxco de Hidalgo"
[165] "Tepeaca" "Tepemaxalco" "Tepeojuma" "Tepetzintla"
[169] "Tepexco" "Tepexi de Rodríguez" "Tepeyahualco" "Tepeyahualco de Cuauhtémoc"
[173] "Tetela de Ocampo" "Teteles de Avila Castillo" "Teziutlán" "Tianguismanalco"
[177] "Tilapa" "Tlacotepec de Benito Juárez" "Tlacuilotepec" "Tlachichuca"
[181] "Tlahuapan" "Tlaltenango" "Tlanepantla" "Tlaola"
[185] "Tlapacoya" "Tlapanalá" "Tlatlauquitepec" "Tlaxco"
[189] "Tochimilco" "Tochtepec" "Totoltepec de Guerrero" "Tulcingo"
[193] "Tuzamapan de Galeana" "Tzicatlacoyan" "Venustiano Carranza" "Vicente Guerrero"
[197] "Xayacatlán de Bravo" "Xicotepec" "Xicotlán" "Xiutetelco"
[201] "Xochiapulco" "Xochiltepec" "Xochitlán de Vicente Suárez" "Xochitlán Todos Santos"
[205] "Yaonáhuac" "Yehualtepec" "Zacapala" "Zacapoaxtla"
[209] "Zacatlán" "Zapotitlán" "Zapotitlán de Méndez" "Zaragoza"
[213] "Zautla" "Zihuateutla" "Zinacatepec" "Zongozotla"
[217] "Zoquiapan" "Zoquitlán"
#' Look up the decimal coordinates of a locality
#'
#' @param data Data frame with `NOM_MUN`, `NOM_LOC`, `longitude_decimal`
#'   and `latitude_decimal` columns.
#' @param municipality Value compared against `NOM_MUN`.
#' @param locality Value compared against `NOM_LOC`.
#' @return A tibble with columns `long` and `lat`, holding one row per
#'   record that matches both names.
extract_coordinates <- function(data, municipality, locality) {
  matches <- filter(data, NOM_MUN == municipality, NOM_LOC == locality)
  tibble(
    long = matches$longitude_decimal,
    lat = matches$latitude_decimal
  )
}
# Coordinates of the locality to highlight; only the first matching row
# is kept.
municipality <- "Acajete"
locality <- "Santa Isabel Tepetzala"
red_point <- slice(extract_coordinates(pueb_norm, municipality, locality), 1)
# red_point <- data.frame(long = -98.2035, lat = 19.0414)
# Keep only the rows of `geojson_file` whose `name` is "Puebla".
# NOTE(review): `geojson_file` is not created anywhere in the visible script,
# so this statement fails until that object has been loaded.
filtered_geojson <- filter(geojson_file, name == "Puebla")
Error: object 'geojson_file' not found
# Collect the column names of the Puebla table and display them.
column_names <- colnames(pueb_norm)
column_names
[1] "ENTIDAD" "MUN" "NOM_MUN" "LOC" "NOM_LOC" "LONGITUD" "LATITUD"
[8] "ALTITUD" "POBTOT" "POBFEM" "POBMAS" "REL_H_M" "POB0_14" "POB15_64"
[15] "POB65_MAS" "P_0A4" "P_0A4_F" "P_0A4_M" "P_5A9" "P_5A9_F" "P_5A9_M"
[22] "P_10A14" "P_10A14_F" "P_10A14_M" "P_15A19" "P_15A19_F" "P_15A19_M" "P_20A24"
[29] "P_20A24_F" "P_20A24_M" "P_25A29" "P_25A29_F" "P_25A29_M" "P_30A34" "P_30A34_F"
[36] "P_30A34_M" "P_35A39" "P_35A39_F" "P_35A39_M" "P_40A44" "P_40A44_F" "P_40A44_M"
[43] "P_45A49" "P_45A49_F" "P_45A49_M" "P_50A54" "P_50A54_F" "P_50A54_M" "P_55A59"
[50] "P_55A59_F" "P_55A59_M" "P_60A64" "P_60A64_F" "P_60A64_M" "P_65A69" "P_65A69_F"
[57] "P_65A69_M" "P_70A74" "P_70A74_F" "P_70A74_M" "P_75A79" "P_75A79_F" "P_75A79_M"
[64] "P_80A84" "P_80A84_F" "P_80A84_M" "P_85YMAS" "P_85YMAS_F" "P_85YMAS_M" "PROM_HNV"
[71] "PNACENT" "PNACENT_F" "PNACENT_M" "PNACOE" "PNACOE_F" "PNACOE_M" "PRES2015"
[78] "PRES2015_F" "PRES2015_M" "PRESOE15" "PRESOE15_F" "PRESOE15_M" "P3YM_HLI" "P3YM_HLI_F"
[85] "P3YM_HLI_M" "P3HLINHE" "P3HLINHE_F" "P3HLINHE_M" "P3HLI_HE" "P3HLI_HE_F" "P3HLI_HE_M"
[92] "PHOG_IND" "POB_AFRO" "POB_AFRO_F" "POB_AFRO_M" "PCON_DISC" "PCON_LIMI" "PSIND_LIM"
[99] "P3A5_NOA" "P3A5_NOA_F" "P3A5_NOA_M" "P6A11_NOA" "P6A11_NOAF" "P6A11_NOAM" "P12A14NOA"
[106] "P12A14NOAF" "P12A14NOAM" "P15A17A" "P15A17A_F" "P15A17A_M" "P18A24A" "P18A24A_F"
[113] "P18A24A_M" "P8A14AN" "P8A14AN_F" "P8A14AN_M" "P15YM_AN" "P15YM_AN_F" "P15YM_AN_M"
[120] "P15YM_SE" "P15YM_SE_F" "P15YM_SE_M" "P15PRI_IN" "P15PRI_INF" "P15PRI_INM" "P15PRI_CO"
[127] "P15PRI_COF" "P15PRI_COM" "P15SEC_IN" "P15SEC_INF" "P15SEC_INM" "P15SEC_CO" "P15SEC_COF"
[134] "P15SEC_COM" "P18YM_PB" "P18YM_PB_F" "P18YM_PB_M" "GRAPROES" "GRAPROES_F" "GRAPROES_M"
[141] "PEA" "PEA_F" "PEA_M" "PE_INAC" "PE_INAC_F" "PE_INAC_M" "POCUPADA"
[148] "POCUPADA_F" "POCUPADA_M" "PDESOCUP" "PDESOCUP_F" "PDESOCUP_M" "PSINDER" "PDER_SS"
[155] "P12YM_SOLT" "P12YM_CASA" "P12YM_SEPA" "PCATOLICA" "PRO_CRIEVA" "POTRAS_REL" "PSIN_RELIG"
[162] "TOTHOG" "HOGJEF_F" "HOGJEF_M" "POBHOG" "PHOGJEF_F" "PHOGJEF_M" "longitude_decimal"
[169] "latitude_decimal" "NOM_ENT"
# Keep the column names that start with "P_" and end in "M" or "F",
# then display them.
matching_columns <- column_names[grepl("^P_.*[MF]$", column_names)]
matching_columns
[1] "P_0A4_F" "P_0A4_M" "P_5A9_F" "P_5A9_M" "P_10A14_F" "P_10A14_M" "P_15A19_F" "P_15A19_M" "P_20A24_F" "P_20A24_M" "P_25A29_F"
[12] "P_25A29_M" "P_30A34_F" "P_30A34_M" "P_35A39_F" "P_35A39_M" "P_40A44_F" "P_40A44_M" "P_45A49_F" "P_45A49_M" "P_50A54_F" "P_50A54_M"
[23] "P_55A59_F" "P_55A59_M" "P_60A64_F" "P_60A64_M" "P_65A69_F" "P_65A69_M" "P_70A74_F" "P_70A74_M" "P_75A79_F" "P_75A79_M" "P_80A84_F"
[34] "P_80A84_M" "P_85YMAS_F" "P_85YMAS_M"
# Split the matching column names by their final letter. endsWith() is
# vectorised, so logical subsetting replaces the element-by-element loop
# that re-allocated each result vector on every append.
ending_in_M <- matching_columns[endsWith(matching_columns, "M")]
ending_in_F <- matching_columns[endsWith(matching_columns, "F")]
# Print each vector of column names, preceded by a caption line
print("Column names ending in M:")
[1] "Column names ending in M:"
# Column names collected for the "M" suffix
print(ending_in_M)
[1] "P_0A4_M" "P_5A9_M" "P_10A14_M" "P_15A19_M" "P_20A24_M" "P_25A29_M" "P_30A34_M" "P_35A39_M" "P_40A44_M" "P_45A49_M" "P_50A54_M"
[12] "P_55A59_M" "P_60A64_M" "P_65A69_M" "P_70A74_M" "P_75A79_M" "P_80A84_M" "P_85YMAS_M"
# Caption for the second vector
print("Column names ending in F:")
[1] "Column names ending in F:"
# Column names collected for the "F" suffix
print(ending_in_F)
[1] "P_0A4_F" "P_5A9_F" "P_10A14_F" "P_15A19_F" "P_20A24_F" "P_25A29_F" "P_30A34_F" "P_35A39_F" "P_40A44_F" "P_45A49_F" "P_50A54_F"
[12] "P_55A59_F" "P_60A64_F" "P_65A69_F" "P_70A74_F" "P_75A79_F" "P_80A84_F" "P_85YMAS_F"
library(ggplot2)
library(dplyr)
# Column names of the 18 age-cohort counts plotted as "Male": the prefix
# "P_", a cohort code, and the suffix "_M".
cohort_names_m <- paste0(
  "P_",
  c(
    "0A4", "5A9", "10A14", "15A19", "20A24", "25A29", "30A34", "35A39",
    "40A44", "45A49", "50A54", "55A59", "60A64", "65A69", "70A74", "75A79",
    "80A84", "85YMAS"
  ),
  "_M"
)
# The "Female" columns use the same cohort codes with an "_F" suffix.
cohort_names_f <- sub("_M$", "_F", cohort_names_m)
# Select the rows for one locality within one municipality.
municipality <- "Acajete"
locality <- "San Javier"
pueb_norm_filt <- filter(
  pueb_norm,
  NOM_MUN == municipality,
  NOM_LOC == locality
)
# Cohort counts from the first matching row, converted to numeric.
cohort_counts_m <- as.numeric(pueb_norm_filt[1, cohort_names_m])
cohort_counts_f <- as.numeric(pueb_norm_filt[1, cohort_names_f])
# Long table with one row per cohort column: the "_M" columns first,
# labelled "Male", then the "_F" columns, labelled "Female".
data <- tibble(
  Cohort = c(cohort_names_m, cohort_names_f),
  Count = c(cohort_counts_m, cohort_counts_f),
  Sex = c(
    rep("Male", length(cohort_names_m)),
    rep("Female", length(cohort_names_m))
  )
)
# Horizontal bar chart of the cohort counts, with bars ordered by
# decreasing count.
# - The fill colours are a named vector so that each colour is tied to its
#   Sex value; unnamed values are assigned in level order (Female, Male),
#   which gave Female blue and Male pink.
# - labs() titles follow the mapped aesthetics, not the flipped screen
#   axes: x is the cohort and y is the count, so the two titles that were
#   swapped are now attached to the right axes.
ggplot(data, aes(x = reorder(Cohort, -Count), y = Count, fill = Sex)) +
  geom_col(position = "identity") +
  scale_fill_manual(values = c(Male = "blue", Female = "pink")) +
  coord_flip() +
  labs(
    title = "Population Pyramid",
    x = "Age Cohort",
    y = "Population Count",
    fill = "Sex"
  ) +
  theme_minimal()
# Display labels for the 18 age groups: five-year bands "0-4" through
# "80-84", followed by the open-ended "85+" band.
new_ages <- c(
  paste(seq(0, 80, by = 5), seq(4, 84, by = 5), sep = "-"),
  "85+"
)
# Mock pyramid data: a random count between 200 and 1000 per age group for
# each sex.
# Age is a factor whose levels follow the order of `new_ages`; a plain
# character column is sorted alphabetically on a discrete axis, which puts
# "10-14" before "5-9".
data <- tibble(
  Age = factor(new_ages, levels = new_ages),
  Male = sample(200:1000, length(cohort_names_m), replace = TRUE),
  Female = sample(200:1000, length(cohort_names_f), replace = TRUE)
)
# Reshape to one row per age group and sex. pivot_longer() comes from
# tidyr, which none of the library() calls visible in this script attach,
# so the call is namespace-qualified to work in a fresh session.
data_long <- tidyr::pivot_longer(
  data,
  cols = c(Male, Female),
  names_to = "Sex",
  values_to = "Population"
)
# Back-to-back bars: "Male" counts are negated so they fall on the negative
# side of zero, abs() keeps the axis labels positive, and the limits are
# symmetric around zero.
basic_plot <- ggplot(
  data_long,
  aes(
    x = Age,
    y = ifelse(Sex == "Male", -Population, Population),
    fill = Sex
  )
) +
  geom_col() +
  scale_y_continuous(
    labels = abs,
    limits = c(-1, 1) * max(data_long$Population)
  ) +
  coord_flip() +
  labs(title = "Population Pyramid", x = "Age", y = "Population", fill = "Sex") +
  theme_minimal()
basic_plot
# Reopen the processed census Parquet dataset and load the rows whose
# NOM_ENT is "Puebla" into memory.
census_dataset <- here("data", "processed", "parquet_data_coords") |>
  open_dataset()
pueb_norm <- collect(filter(census_dataset, NOM_ENT == "Puebla"))
Warning: Invalid metadata$rWarning: Invalid metadata$rWarning: Invalid metadata$rWarning: Invalid metadata$r
# Select the rows for one locality within one municipality.
municipality <- "Acajete"
locality <- "San Javier"
pueb_norm_filt <- filter(
  pueb_norm,
  NOM_MUN == municipality,
  NOM_LOC == locality
)
# Column names of the 18 age-cohort counts used for the "Male" series: the
# prefix "P_", a cohort code, and the suffix "_M".
cohort_names_m <- paste0(
  "P_",
  c(
    "0A4", "5A9", "10A14", "15A19", "20A24", "25A29", "30A34", "35A39",
    "40A44", "45A49", "50A54", "55A59", "60A64", "65A69", "70A74", "75A79",
    "80A84", "85YMAS"
  ),
  "_M"
)
# The "Female" series uses the same cohort codes with an "_F" suffix.
cohort_names_f <- sub("_M$", "_F", cohort_names_m)
# Display labels for the 18 age groups: five-year bands "0-4" through
# "80-84", followed by the open-ended "85+" band.
new_ages <- c(
  paste(seq(0, 80, by = 5), seq(4, 84, by = 5), sep = "-"),
  "85+"
)
# Pyramid data for the selected locality: counts from the first matching
# row, converted to numeric, one column per sex.
# Age is a factor whose levels follow the order of `new_ages`; a plain
# character column is sorted alphabetically on a discrete axis, which puts
# "10-14" before "5-9".
data <- tibble(
  Age = factor(new_ages, levels = new_ages),
  Male = as.numeric(pueb_norm_filt[1, cohort_names_m]),
  Female = as.numeric(pueb_norm_filt[1, cohort_names_f])
)
# Reshape to one row per age group and sex. pivot_longer() comes from
# tidyr, which none of the library() calls visible in this script attach,
# so the call is namespace-qualified to work in a fresh session.
data_long <- tidyr::pivot_longer(
  data,
  cols = c(Male, Female),
  names_to = "Sex",
  values_to = "Population"
)
# Back-to-back bars: "Male" counts are negated so they fall on the negative
# side of zero, abs() keeps the axis labels positive, and the limits are
# symmetric around zero.
basic_plot <- ggplot(
  data_long,
  aes(
    x = Age,
    y = ifelse(Sex == "Male", -Population, Population),
    fill = Sex
  )
) +
  geom_col() +
  scale_y_continuous(
    labels = abs,
    limits = c(-1, 1) * max(data_long$Population)
  ) +
  coord_flip() +
  labs(title = "Population Pyramid", x = "Age", y = "Population", fill = "Sex") +
  theme_minimal()
basic_plot